library("cluster")
library("dendextend")
##
## ---------------------
## Welcome to dendextend version 1.12.0
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
source("functions.R")
## Loading required package: ggplot2
# Corpus survey: estimated length in words of every text, used to decide
# which texts are long enough to be analysed.
#
# The word-frequency table was built once with stylo and cached as CSV; the
# generating calls are kept below for reference.
# Get data with Stylo
# data = stylo::load.corpus.and.parse(corpus.dir = "~/dev/dh-meier/output/kraken-nospace/tokenized/boudams", features = "w", ngram.size = 1, preserve.case = FALSE)
# Get freq lists
#data = stylo::make.table.of.frequencies(corpus = data, features = unique(sort(unlist(data))), relative = FALSE)
# Write it
#write.csv(as.matrix(data), "data/kraken_nospace_expanded_words.csv")
data <- read.csv("data/kraken_nospace_expanded_words.csv", header = TRUE, row.names = 1)
# Transpose so that each column is one text.
data <- t(data)
# Column totals: one word count per text.
nwords <- colSums(data)
summary(nwords)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 296 2274 3598 5051 6824 19139
boxplot(nwords)
# Outlying text lengths. boxplot.stats() applies the same 1.5 * IQR rule that
# boxplot() uses by default, without drawing the figure a second time.
boxplot.stats(nwords)$out
## 05_Ano_Leg-A_Ap_NA_Vie_Jacques 29_Wau_Leg-C_Co_Ev_Vie_Martin
## 14476 14639
## 31_Wau_Leg-C_Co_Ev_Dia_Martin3 34_Wau_Leg-C_Co_Ev_Vie_Martial
## 19139 15407
# The 15 shortest texts.
head(sort(nwords), n = 15)
## 62_Ano_Leg-N_NA_NA_NA_Index 03_Ano_Leg-A_Ap_NA_Mar_Jean
## 296 301
## 61_Ano_Leg-B_NA_NA_NA_Jugement 30_Wau_Leg-C_Co_Ev_Tra_Martin2
## 412 726
## 08_Ano_Leg-A_Ap_NA_Vie_Philippe 59_Ano_Leg-C_Vi_NA_Vie_Euphrasie
## 1017 1307
## 09_Ano_Leg-A_Ap_NA_Vie_JacquesMineur 32_Wau_Leg-C_Co_Ev_Vie_Brice
## 1375 1394
## 60_Ano_Leg-B_NA_NA_NA_Antechriste 54_Ano_Leg-C_Vi_NA_Vie_Pelagie
## 1493 1524
## 20_Ano_Leg-B_Ma_Ho_Vie_Felicite 11_Ano_Leg-A_Ap_NA_Vie_Marc
## 1695 1856
## 23_Ano_Leg-B_Ma_Ho_Vie_Sixte 53_Ano_Leg-C_Vi_NA_Vie_Marguerite
## 1953 1961
## 35_Wau_Leg-C_Co_Ev_Vie_Nicolas
## 1974
# Texts retained for all later analyses: more than 1000 words, and not one of
# the texts whose name contains "Bestiaire".
toKeep <- colnames(data)[nwords > 1000]
toKeep <- toKeep[!grepl("Bestiaire", toKeep)]
# Violin + box plot of the text lengths (all texts, before filtering).
df <- as.data.frame(nwords)
ggplot(df, aes(x = "", y = nwords)) +
  geom_violin() +
  geom_boxplot(width = 0.3) +
  theme(
    axis.text.y = element_text(size = rel(1.4)),
    axis.title = element_text(size = rel(1.4))
  ) +
  xlab("Est. length in words of corpus texts") +
  scale_y_continuous(breaks = c(0, 2500, 5000, 7500, 10000, 12500, 15000, 17500))
# ---- Character 3-grams from the raw transcription ----
# The frequency table was produced once with stylo and cached as CSV; the
# generating calls are kept for reference.
# Get data with Stylo
#data = stylo::load.corpus.and.parse(corpus.dir = "~/dev/dh-meier/output/kraken-nospace/raw/", features = "c", ngram.size = 3, preserve.case = FALSE)
# Get freq lists
#data = stylo::make.table.of.frequencies(corpus = data, features = unique(sort(unlist(data))), relative = FALSE)
# Write it
#write.csv(as.matrix(data), "data/kraken_nospace_raw_char3grams.csv")

# Load and transpose so that columns are texts, then keep only the retained
# texts and drop features that no longer occur in any of them.
data <- t(read.csv("data/kraken_nospace_raw_char3grams.csv", header = TRUE, row.names = 1))
data <- data[, toKeep]
data <- data[rowSums(data) > 0, ]
d <- data

# Feature selection based on Moisl 2011 (helper from functions.R); only the
# fourth column of its result is used as the row selector.
select <- selection(d, z = 1.645)[, 4]

# Relative frequencies; the full (unselected) table is kept for robustness checks.
d <- relativeFreqs(d)
Raw3grSave <- d

# Restrict to the selected features, normalise, then cluster the texts
# (columns of d, hence the transpose) with Ward linkage on Manhattan distance.
d <- normalisations(d[select, ])
myCAH <- cluster::agnes(t(d), metric = "manhattan", method = "ward")
CAHRaw3gr <- myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotRaw3grams <- cahPlotCol(
  myCAH,
  k = 9,
  main = "Characters 3-grams from raw data (Transkr)"
)

# Same features, clustered through the SOM-based helper.
somCAH <- somCluster(d)
somplotRaw3grams <- cahPlotCol(
  somCAH,
  k = 9,
  main = "SOM BASED - Characters 3-grams from raw data (Transkr)"
)
# ---- Expanded word forms ----
# Load and transpose so that columns are texts, then keep only the retained
# texts and drop word forms that no longer occur in any of them.
data <- t(read.csv("data/kraken_nospace_expanded_words.csv", header = TRUE, row.names = 1))
data <- data[, toKeep]
data <- data[rowSums(data) > 0, ]
d <- data

# Feature selection based on Moisl 2011 (helper from functions.R); only the
# fourth column of its result is used as the row selector.
select <- selection(d, z = 1.645)[, 4]

# Relative frequencies; the full (unselected) table is kept for robustness checks.
d <- relativeFreqs(d)
WordsSave <- d

# Restrict to the selected features, normalise, then cluster the texts
# (columns of d, hence the transpose) with Ward linkage on Manhattan distance.
d <- normalisations(d[select, ])
myCAH <- cluster::agnes(t(d), metric = "manhattan", method = "ward")
CAHForms <- myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotForms <- cahPlotCol(
  myCAH,
  k = 9,
  main = "Expanded word forms (Transkr/Boudams/Pie)"
)

# Same features, clustered through the SOM-based helper.
somCAH <- somCluster(d)
somplotForms <- cahPlotCol(
  somCAH,
  k = 9,
  main = "SOM BASED - Expanded word forms (Transkr/Boudams/Pie)"
)
# ---- Affixes ----
# Build the affix table from all word forms. `data` is the filtered
# word-form matrix left by the preceding section.
dataAffs <- countAffixes(data)
d <- dataAffs

# Feature selection based on Moisl 2011 (helper from functions.R); only the
# fourth column of its result is used as the row selector.
select <- selection(d, z = 1.645)[, 4]

# Relative frequencies restricted to the selected affixes. This selected
# table is the one saved here.
d <- relativeFreqs(d)[select, ]
AffixesSave <- d

# Normalise, then cluster the texts (columns of d, hence the transpose)
# with Ward linkage on Manhattan distance.
d <- normalisations(d)
myCAH <- cluster::agnes(t(d), metric = "manhattan", method = "ward")
CAHAffs <- myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotAffixes <- cahPlotCol(
  myCAH,
  k = 9,
  main = "Expanded affixes (Transkr/Boudams/Pie)"
)

# Same features, clustered through the SOM-based helper.
somCAH <- somCluster(d)
somplotAffixes <- cahPlotCol(
  somCAH,
  k = 9,
  main = "SOM BASED - Expanded affixes (Transkr/Boudams/Pie)"
)
# ---- Function words ----
#labels(sort(rowSums(data), decreasing = TRUE)[1:300])
# With or without pronouns?
# functionWords.R is sourced for the value of its last expression, which is
# used below as the row selector for the function words.
functionWords <- source("functionWords.R")$value

# Relative frequencies of the word forms, restricted to the function words.
# `data` is the filtered word-form matrix from the word-forms section.
d <- relativeFreqs(data)[functionWords, ]
# save data for robustness checks
FWSave <- d

# Normalise, then cluster the texts (columns of d, hence the transpose)
# with Ward linkage on Manhattan distance.
d <- normalisations(d)
myCAH <- cluster::agnes(t(d), metric = "manhattan", method = "ward")
CAHFW <- myCAH
# barplot(sort(myCAH$height))
plotFW <- cahPlotCol(
  myCAH,
  k = 8,
  main = "Function words with pronouns and auxiliaries\n(Transkr/Boudams/Pie)"
)
#plotCol(myCAH, main = "toto")

# Same features, clustered through the SOM-based helper.
somCAH <- somCluster(d)
somplotFW <- cahPlotCol(somCAH, k = 9, main = "SOM BASED - Function words")
# ---- POS 3-grams ----
# This table is ';'-separated, with texts as columns and features as rows.
data <- read.csv("data/kraken_nospace_pos3-gr.csv", header = TRUE, row.names = 1, sep = ";")
#remove total freq
data <- data[, -1]
# read.csv() rewrites the headers into syntactic R names: an "X" is prefixed
# to names starting with a digit and characters such as "-" become ".".
# Undo that so the column names match the text names in toKeep.
colnames(data) <- gsub("^X", "", colnames(data))
# fixed = TRUE: match the literal text. As regular expressions the "." in
# these patterns would match ANY character (e.g. "Lege" would become "Leg-").
# NOTE(review): assumes the header suffix is a literal ".decolumnized" - confirm.
colnames(data) <- gsub(".decolumnized", "", colnames(data), fixed = TRUE)
colnames(data) <- gsub("Leg.", "Leg-", colnames(data), fixed = TRUE)
# Keep only the retained texts and drop features that no longer occur.
data <- data[, toKeep]
data <- data[rowSums(data) > 0, ]
data <- as.matrix(data)
d <- data
# Selection based on Moisl 2011 (helper from functions.R); only the fourth
# column of its result is used as the row selector.
select <- selection(d, z = 1.645)
select <- select[, 4]
# Normalisations
d <- relativeFreqs(d)
d <- d[select, ]
# save data for robustness checks (the table saved here is the selected one)
POS3grSave <- d
d <- normalisations(d)
# Cluster the texts (columns of d, hence the transpose) with Ward linkage
# on Manhattan distance.
myCAH <- cluster::agnes(t(d), metric = "manhattan", method = "ward")
# Save
CAHPOS3gr <- myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotPOS3grams <- cahPlotCol(myCAH, k = 9, main = "POS 3-grams (Transkr/Boudams/Pie/Pie)")
# Same features, clustered through the SOM-based helper.
somCAH <- somCluster(d)
somplotPOS3grams <- cahPlotCol(somCAH, k = 9, main = "SOM BASED - POS 3-grams")
# ---- Lemmas ----
# This table is ';'-separated, with texts as columns and lemmas as rows.
data <- read.csv("data/kraken_nospace_lemmas.csv", header = TRUE, row.names = 1, sep = ";")
#remove total freq
data <- data[, -1]
# read.csv() rewrites the headers into syntactic R names: an "X" is prefixed
# to names starting with a digit and characters such as "-" become ".".
# Undo that so the column names match the text names in toKeep.
colnames(data) <- gsub("^X", "", colnames(data))
# fixed = TRUE: match the literal text. As regular expressions the "." in
# these patterns would match ANY character (e.g. "Lege" would become "Leg-").
# NOTE(review): assumes the header suffix is a literal ".decolumnized" - confirm.
colnames(data) <- gsub(".decolumnized", "", colnames(data), fixed = TRUE)
colnames(data) <- gsub("Leg.", "Leg-", colnames(data), fixed = TRUE)
# Keep only the retained texts and drop lemmas that no longer occur.
data <- data[, toKeep]
data <- data[rowSums(data) > 0, ]
data <- as.matrix(data)
d <- data
# Selection based on Moisl 2011 (helper from functions.R); only the fourth
# column of its result is used as the row selector.
select <- selection(d, z = 1.645)
select <- select[, 4]
# Normalisations
d <- relativeFreqs(d)
d <- d[select, ]
# Saved after selection.
LemmasSave <- d
d <- normalisations(d)
# Cluster the texts (columns of d, hence the transpose) with Ward linkage
# on Manhattan distance.
myCAH <- cluster::agnes(t(d), metric = "manhattan", method = "ward")
# Save
CAHLemmas <- myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotLemmas <- cahPlotCol(myCAH, k = 9, main = "Lemmas (Transkr/Boudams/Pie/Pie)")
# Same features, clustered through the SOM-based helper.
somCAH <- somCluster(d)
somplotLemmas <- cahPlotCol(somCAH, k = 9, main = "SOM BASED - Lemmas")
# ---- Function lemmas ----
# Find function words
#rownames(data)[1:250]
# functionLemmas.R is sourced for the value of its last expression, which is
# used below as the row selector for the function lemmas.
functionLemmas <- source("functionLemmas.R")$value
# Relative frequencies of the lemmas, restricted to the function lemmas.
# `data` is the filtered lemma matrix left by the preceding section.
d <- relativeFreqs(data)
d <- d[functionLemmas, ]
FLSave <- d
d <- normalisations(d)
# Cluster the texts (columns of d, hence the transpose) with Ward linkage
# on Manhattan distance.
myCAH <- cluster::agnes(t(d), metric = "manhattan", method = "ward")
# Save
CAHFL <- myCAH
# barplot(sort(myCAH$height))
# The raw corpus used to be re-parsed with stylo at this point. Its result was
# never read (`data` is reassigned before its next use) and the call needs a
# machine-specific path, so it is disabled like the other stylo calls in this
# script; its recorded loading log has been dropped accordingly.
#data = stylo::load.corpus.and.parse(corpus.dir = "~/dev/dh-meier/output/kraken-nospace/raw/", features = "c", ngram.size = 3, preserve.case = FALSE)
plotFL <- cahPlotCol(myCAH, k = 8, main = "Function Lemmas with pronouns and auxiliaries\n(Transkr/Boudams/Pie)")
#plotCol(myCAH, main = "toto")
# Same features, clustered through the SOM-based helper.
somCAH <- somCluster(d)
somplotFL <- cahPlotCol(somCAH, k = 9, main = "SOM BASED - Function words (lemmas)")
# ---- Combined features ----
# Stack the saved affix, POS 3-gram and function-lemma tables row-wise into a
# single feature matrix, then normalise it.
data <- rbind(AffixesSave, POS3grSave, FLSave)
d <- normalisations(data)

# Cluster the texts (columns of d, hence the transpose) with Ward linkage
# on Manhattan distance.
myCAH <- cluster::agnes(t(d), metric = "manhattan", method = "ward")
CAHGlob <- myCAH
#TODO: heights
# barplot(sort(myCAH$height))
plotGlob <- cahPlotCol(
  myCAH,
  k = 9,
  main = "Affixes + POS 3- grams + Function words (lemmas)"
)

# Same features, clustered through the SOM-based helper.
somCAH <- somCluster(d)
somplotGlob <- cahPlotCol(
  somCAH,
  k = 9,
  main = "SOM BASED - Affixes + POS 3- grams + Function words (lemmas)"
)
# ---- Summary figures and comparison of the clusterings ----
# Earlier panel layout, kept for reference:
#featlabel = "features of ME ±2σ with conf. > 90%"
#A = cahPlotCol(CAHLemma, main = "A", xlab = paste( ncol(CAHLemma$data), featlabel), k = 6, lrect = -12)
# B = cahPlotCol(CAHRhyme, main = "B", xlab = paste( ncol(CAHRhyme$data), featlabel), k = 6, lrect = -7, ylab = " ")
# C = cahPlotCol(CAHAllWords, main = "C", xlab = paste( ncol(CAHAllWords$data), featlabel), k = 6, ylab = " ")
# D = cahPlotCol(CAHAffs, main = "D", xlab = paste( ncol(CAHAffs$data), featlabel), k = 6, ylab = " ")
# E = cahPlotCol(CAHPOS3gr, main = "E", xlab = paste( ncol(CAHPOS3gr$data), featlabel), k = 6, lrect = -12 , ylab = " ")
# F = cahPlotCol(CAHmfw, main = "F", k = 6, lrect = -5, ylab = " ")
# gridExtra::grid.arrange(A, B, C, D, E, F, ncol = 2)

# All agnes-based plots on a two-column grid.
gridExtra::grid.arrange(
  plotRaw3grams, plotForms, plotAffixes, plotFW,
  plotLemmas, plotFL, plotPOS3grams, plotGlob,
  ncol = 2
)

# The SOM-based counterparts, in the same order.
gridExtra::grid.arrange(
  somplotRaw3grams, somplotForms, somplotAffixes, somplotFW,
  somplotLemmas, somplotFL, somplotPOS3grams, somplotGlob,
  ncol = 2
)

# Named list of the eight agnes results, passed to compareHC() (helper from
# functions.R) with k = 9; its printed output is the matrix recorded below.
cahList <- list(
  raw3grams = CAHRaw3gr,
  Forms = CAHForms,
  Affs = CAHAffs,
  FW = CAHFW,
  Lemmas = CAHLemmas,
  FunctLemm = CAHFL,
  POS3gr = CAHPOS3gr,
  Global = CAHGlob
)
compareHC(cahList, k = 9)
## raw3grams Forms Affs FW Lemmas FunctLemm
## raw3grams 1.0000000 0.7966102 0.8474576 0.8474576 0.6949153 0.6949153
## Forms 0.7627119 1.0000000 0.7457627 0.8305085 0.6610169 0.6779661
## Affs 0.8644068 0.7457627 1.0000000 0.7457627 0.6779661 0.7118644
## FW 0.8474576 0.8305085 0.7457627 1.0000000 0.6949153 0.7118644
## Lemmas 0.6949153 0.6271186 0.6271186 0.6779661 1.0000000 0.6949153
## FunctLemm 0.6440678 0.6440678 0.6949153 0.6779661 0.6779661 1.0000000
## POS3gr 0.5932203 0.6610169 0.6440678 0.6440678 0.5932203 0.7118644
## Global 0.8474576 0.7796610 0.8305085 0.7627119 0.6779661 0.6779661
## POS3gr Global
## raw3grams 0.6440678 0.8644068
## Forms 0.6610169 0.7627119
## Affs 0.6779661 0.8474576
## FW 0.6779661 0.7796610
## Lemmas 0.6610169 0.6779661
## FunctLemm 0.6949153 0.6440678
## POS3gr 1.0000000 0.6271186
## Global 0.6779661 1.0000000